{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Query real time data from DingXiangYuan, and keep the latest records every day for each city\n",
    "\n",
    "## Created on Sat Feb  8 12:41:50 2020\n",
    "## Author: leebond\n",
    "#### resource: https://github.com/jianxu305/nCov2019_analysis\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import pickle as pkl\n",
    "import numpy as np\n",
    "import math\n",
    "import datetime\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "from googletrans import Translator # package used to translate Chinese into English\n",
    "translator = Translator()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-02-14 18:18:14Update records successfully to ../data/DXY_Chinese.csv\r\n"
     ]
    }
   ],
   "source": [
    "## Query the latest data\n",
    "! python dataset.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>country</th>\n",
       "      <th>countryCode</th>\n",
       "      <th>province</th>\n",
       "      <th>city</th>\n",
       "      <th>confirmed</th>\n",
       "      <th>suspected</th>\n",
       "      <th>cured</th>\n",
       "      <th>dead</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10541</th>\n",
       "      <td>2020-02-14</td>\n",
       "      <td>美国</td>\n",
       "      <td>US</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10542</th>\n",
       "      <td>2020-02-14</td>\n",
       "      <td>越南</td>\n",
       "      <td>VN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             date country countryCode province city  confirmed  suspected  \\\n",
       "10541  2020-02-14      美国          US      NaN  NaN         15          0   \n",
       "10542  2020-02-14      越南          VN      NaN  NaN         16          0   \n",
       "\n",
       "       cured  dead  \n",
       "10541      3     0  \n",
       "10542      5     0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DXYArea = pd.read_csv('../data/DXY_Chinese.csv') # Read Chinese version \n",
    "# select column\n",
    "DXYArea = DXYArea[['date','country','countryCode','province', 'city', 'confirmed', 'suspected', 'cured', 'dead']]\n",
    "\n",
    "DXYArea.tail(2)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def isNaN(num):\n",
    "    return num != num\n",
    "\n",
    "## Resoruce for Chinese - English Translation\n",
    "with open('chineseProvince_to_EN.pkl','rb') as f:\n",
    "    prov_dict = pkl.load(f)\n",
    "\n",
    "    \n",
    "with open('chineseCity_to_EN.pkl','rb') as f:\n",
    "    city_dict = pkl.load(f)    \n",
    "        \n",
    "def translate_to_English(data, prov_dict, city_dict):\n",
    "    \"\"\"Translate Chinese in dataset to English\n",
    "    \"\"\"        \n",
    "    data['province'] = data['province'].apply(getProvinceTranslation)\n",
    "    data['city'] = data['city'].apply(getCityTranslation)\n",
    "    \n",
    "    for city in unable_translation: # remove these unable translated data\n",
    "        data = data[data['city']!=city]\n",
    "    return data\n",
    "    \n",
    "def getProvinceTranslation(name):\n",
    "    if not isNaN(name):\n",
    "        return prov_dict[name]\n",
    "    else: \n",
    "        return name\n",
    "\n",
    "unable_translation = []\n",
    "def getCityTranslation(name):\n",
    "    try:\n",
    "        if not isNaN(name): \n",
    "            return city_dict[name]\n",
    "        else:\n",
    "            return name\n",
    "    except:\n",
    "        unable_translation.append(name)\n",
    "        #print(name + ' cannot be translated\\n')\n",
    "        return name\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>country</th>\n",
       "      <th>countryCode</th>\n",
       "      <th>province</th>\n",
       "      <th>city</th>\n",
       "      <th>confirmed</th>\n",
       "      <th>suspected</th>\n",
       "      <th>cured</th>\n",
       "      <th>dead</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019-12-01</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-12-01</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>Hubei Province</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2019-12-01</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>Hubei Province</td>\n",
       "      <td>Wuhan</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date country countryCode        province   city  confirmed  \\\n",
       "0  2019-12-01      中国          CN             NaN    NaN          1   \n",
       "1  2019-12-01      中国          CN  Hubei Province    NaN          1   \n",
       "2  2019-12-01      中国          CN  Hubei Province  Wuhan          1   \n",
       "\n",
       "   suspected  cured  dead  \n",
       "0          0      0     0  \n",
       "1          0      0     0  \n",
       "2          0      0     0  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "daily_frm_DXYArea = translate_to_English(DXYArea, prov_dict, city_dict)\n",
    "\n",
    "daily_frm_DXYArea.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_days(DXYArea: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:\n",
    "    \"\"\"\n",
    "    Create a new column: Days, number of days after 2019-12-08 (detect the first case)\n",
    "    \"\"\"\n",
    "    DXYArea['date'] = pd.to_datetime(DXYArea['date'])\n",
    "    first_day = datetime.datetime(2019, 12, 8) # the time when detected the first case (2019-12-08)\n",
    "    DXYArea['Days'] = (DXYArea['date'] - first_day).dt.days\n",
    "    return DXYArea\n",
    "\n",
    "def add_net_confirmed_case(DXYArea: pd.core.frame.DataFrame)-> pd.core.frame.DataFrame:\n",
    "    \"\"\"\n",
    "    Add net confirmed case = confirmed - cured - dead\n",
    "    \"\"\"\n",
    "    DXYArea['net_confirmed'] = DXYArea['confirmed'] - DXYArea['cured'] - DXYArea['dead']\n",
    "    return DXYArea"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_frm_DXYArea = add_days(daily_frm_DXYArea)  # add the number of days after 2019-12-08\n",
    "daily_frm_DXYArea = add_net_confirmed_case(daily_frm_DXYArea) # add net confirmed case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>country</th>\n",
       "      <th>countryCode</th>\n",
       "      <th>province</th>\n",
       "      <th>city</th>\n",
       "      <th>confirmed</th>\n",
       "      <th>suspected</th>\n",
       "      <th>cured</th>\n",
       "      <th>dead</th>\n",
       "      <th>Days</th>\n",
       "      <th>net_confirmed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019-12-01</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-12-01</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>Hubei Province</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2019-12-01</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>Hubei Province</td>\n",
       "      <td>Wuhan</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-12-02</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2019-12-02</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>Hubei Province</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date country countryCode        province   city  confirmed  suspected  \\\n",
       "0 2019-12-01      中国          CN             NaN    NaN          1          0   \n",
       "1 2019-12-01      中国          CN  Hubei Province    NaN          1          0   \n",
       "2 2019-12-01      中国          CN  Hubei Province  Wuhan          1          0   \n",
       "3 2019-12-02      中国          CN             NaN    NaN          1          0   \n",
       "4 2019-12-02      中国          CN  Hubei Province    NaN          1          0   \n",
       "\n",
       "   cured  dead  Days  net_confirmed  \n",
       "0      0     0    -7              1  \n",
       "1      0     0    -7              1  \n",
       "2      0     0    -7              1  \n",
       "3      0     0    -6              1  \n",
       "4      0     0    -6              1  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "daily_frm_DXYArea.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_frm_DXYArea.to_csv('../data/DXYArea.csv', index = None, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
